import time
from collections import Counter

import jieba

# Build a token -> index vocabulary from the corpus at `data_path` and write
# it to "sources/voc_dict" as one "<token>,<index>" line per entry.
#
# Steps:
#   1. Load the stop words (one per line) from `data_stop_path`.
#   2. Segment every data row with jieba and count the non-stop-word tokens.
#   3. Keep tokens seen more than `min_seq` times, take the `top_n` most
#      frequent, and number them 0..N-1 in descending frequency order.
#   4. Append the special UNK and PAD entries and save the mapping.
t1 = time.time()

data_path = "./sources/weibo_senti_100k.csv"
data_stop_path = "./sources/hit_stopword.txt"
min_seq = 1   # strict threshold: a token is kept only if its count > min_seq
top_n = 1000  # maximum number of real tokens in the vocabulary
UNK = "<UNK>"
PAD = "<PAD>"

with open(data_stop_path, encoding="utf-8") as f:
    stop_list = [line.strip() for line in f]
stop_list.append(" ")
stop_list.append("\n")
# Membership is tested once per token, so use a hashed container instead of
# scanning the list each time.
stop_words = frozenset(stop_list)

voc_counter = Counter()
with open(data_path, encoding="utf-8") as f:
    next(f, None)  # skip the header row (no-op on an empty file)
    for item in f:
        # The row is sliced positionally: character 0 is taken to be the
        # label and character 1 its separator, so the text starts at index 2.
        content = item[2:].strip()
        voc_counter.update(
            word
            for word in jieba.cut(content, cut_all=False)
            if word not in stop_words
        )

# sorted() is stable, so tokens with equal counts stay in first-seen order.
voc_list = [pair for pair in voc_counter.items() if pair[1] > min_seq]
voc_list = sorted(voc_list, key=lambda pair: pair[1], reverse=True)[:top_n]

voc_dict = {word: idx for idx, (word, _count) in enumerate(voc_list)}
# Both len() calls are evaluated before the update, so UNK gets the next free
# index and PAD the one after it.
voc_dict.update({
    UNK: len(voc_dict),
    PAD: len(voc_dict) + 1,
})

with open("sources/voc_dict", "w", encoding="utf-8") as f:
    f.writelines("{},{}\n".format(key, idx) for key, idx in voc_dict.items())

print((time.time() - t1) / 60, "min")